Quality control and filtering results from cellranger

Sample info and environment setup

PRJNA732205

setwd("/media/jacopo/Elements/re_align/MM/PRJNA732205/SAMN19314104/SRR14629340/")
# Load the libraries (from Sarah script + biomart)
library(tidyverse) # packages for data wrangling, visualization etc
library(Seurat) # scRNA-Seq analysis package
library(clustree) # plot of clustering tree 
library(ggsignif) # Enrich your 'ggplots' with group-wise comparisons
library(clusterProfiler) #The package implements methods to analyze and visualize functional profiles of gene and gene clusters.
library(org.Hs.eg.db) # Human annotation package neede for clusterProfiler
library(ggrepel) # extra geoms for ggplo2
library(patchwork) #multiplots
library(reticulate)

Load and process cellranger data

Load and do the QC for the cellranger data

#list.files(".")
dat <- Read10X(data.dir ="./out/counts_filtered/")
dat <- CreateSeuratObject(dat) # Create the seurat object from the 10x data
kb.initial <- dat@assays[["RNA"]]@counts@Dim[[2]]
cat("Initial number of cells:", kb.initial, 
    "\nNumber of genes:",  dat@assays[["RNA"]]@counts@Dim[[1]])
## Initial number of cells: 10310 
## Number of genes: 36601

Quality Control

Empty cells were already filtered, check for % mt RNA and death markers:

# first calculate the mitochondrial percentage for each cell
dat$percent_mt <- PercentageFeatureSet(dat, pattern="^MT.")
# make violin plots
mt_rna = 20
max_counts = 15000



# Check some feature-feature relationships
# % mt RNA vs n Counts, n Features vs n Counts
# Check some feature-feature relationships
# % mt RNA vs n Counts, n Features vs n Counts
VlnPlot(dat, features = c("nCount_RNA", "nFeature_RNA", "percent_mt"))  + geom_hline(yintercept=mt_rna, linetype = "dotted")

plot1 <- FeatureScatter(dat, feature1 = "nCount_RNA", feature2 = "percent_mt")
plot1 <- plot1 + geom_hline(yintercept=mt_rna, linetype = "dotted")
plot2 <- FeatureScatter(dat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot2 <- plot2 + geom_vline(xintercept = max_counts, linetype = "dotted")
plot1 

plot2

##  cells retained by mt RNA content ( 20 %): 10221 
##  percentage of retained cells: 99.14 %
## cells retained by counts ( 15000 ): 10203 
##  percentage of retained cells: 98.96 %

Check the distribution of the cells with low counts and control death markers:

min_counts = 200


hist(dat@meta.data$nCount_RNA, breaks = 100, xlab = "Counts")

hist(dat@meta.data$nCount_RNA, breaks = 1000, xlab = "Counts", xlim = c(0,5000))

hist(dat@meta.data$nCount_RNA, breaks = 10000, xlab = "Counts", xlim = c(0,1000))
abline(v=min_counts, col="red", lty = 3)

The evident peak of cells with < 200 counts could contain dying cells.

# Subset the dataset to focus only on those cells with low counts
dat.lowcount <- subset(dat, subset = nCount_RNA < min_counts)

# Get the mean of the counts for each gene and sort them decreasing
meanCounts <- rowMeans(GetAssayData(object = dat.lowcount, slot = 'counts'))
meanCounts <- sort(meanCounts, decreasing = T)

# A boxplot can help to observe the distribution of the means
#boxplot(meanCounts)

# Print the most highly expressed genes
head(meanCounts, 30)
##       IGKC        HBB     MALAT1     JCHAIN       HBA2      IGHA1        B2M 
## 23.3448276  5.7068966  5.1034483  3.6034483  3.2241379  2.1551724  2.0517241 
##     EEF1A1      RPS27      RPLP1      RPL10      RPL41       FTH1      RPL13 
##  1.2758621  1.0344828  0.9827586  0.9655172  0.9310345  0.8448276  0.8448276 
##      RPL32      RPS18     RPS15A      RPL34      RPS12     RPL13A     MT-ND2 
##  0.8103448  0.7931034  0.7241379  0.7068966  0.6551724  0.6551724  0.6551724 
##      HLA-B      RPS4X       HBA1        FTL      RPS23       RPS6      RPS25 
##  0.6206897  0.6034483  0.5862069  0.5689655  0.5517241  0.5517241  0.5517241 
##       EIF1     RPS27A 
##  0.5517241  0.5172414
## cells retained by counts ( 200 ): 10145 
##  percentage of retained cells: 98.4 %

dir.create("result")
saveRDS(dat, file = "./result/SAMN19314104_clean_QC.Rds")

Feature selection

#Normalize
dat <- NormalizeData(dat)
# Find the first 4000 variabe features
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 4000)

Data scaling

Set mean expression to 0 and variance across 1 to avoid highly expressed genes drive the forwarding analyses. Since negative expression is meaningless, scaled data are useful only for UMAP and clustering

# scale data, the scaled data are saved in:
# dat[["RNA"]]@scale.data

all.genes <- rownames(dat)

dat <- ScaleData(dat, vars.to.regress = c("percent_mt","nCount_RNA"))

Dimensionality reduction

dat <- RunPCA(dat, features = VariableFeatures(object = dat), verbose = F, seed.use = 1)
print(dat[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  JCHAIN, SSR4, CCDC144A, SEC11C, IGKC 
## Negative:  BIRC5, TYMS, STMN1, PCLAF, ZWINT 
## PC_ 2 
## Positive:  CD74, TMSB10, CTSH, RPL36, LTB 
## Negative:  IGKC, IGHA1, SSR4, HLA-C, JUN 
## PC_ 3 
## Positive:  RPS12, HLA-E, RPL12, RPL10, RPL36 
## Negative:  IGKC, CTSH, PLEK, DSCR8, ADTRP 
## PC_ 4 
## Positive:  LTB, S100A4, RPL18A, CCR7, HOPX 
## Negative:  FTH1, CCPG1, SEC11C, CCDC144A, TRIB3 
## PC_ 5 
## Positive:  JCHAIN, RPL36, RPL12, RPS12, GAS5 
## Negative:  KLF6, FRZB, CCND1, IGKC, TSC22D3

UMAP

UMAP is a graph-based method of clustering. The first step in this process is to construct a KNN graph based on the euclidean distance in PCA space:

dat <- FindNeighbors(dat, dims = 1:20)

The graph now can be used as input for the function runUMAP()

dat <- RunUMAP(dat, dims = 1:20, seed.use = 1)
DimPlot(dat, reduction = 'umap', seed = 1)

Final plots:

## QC metrics

## markers